
clear all
local user "awcassidy1"
*DO SMARTY STREETS BETWEEN CLEAN1A and CLEAN1B


********************************************************************************
*Electricity Data
********************************************************************************
import delimited "C:\Users/`user'\Dropbox\jmp_new\Austin Electricity and Gas Statistics\Residential_Average_Monthly_kWh_and_Bills.csv", ///
	clear

gen datemonthstarting=date(monthstarting, "MDYhm")

gen salemonth=month(datemonthstarting)
gen saleyear=year(datemonthstarting)

drop datemonthstarting monthstarting

rename fuel* austinelec 

keep salemonth saleyear austinelec

sort saleyear salemonth, stable

save "C:\Users/`user'\Dropbox\jmp_new\Austin Electricity and Gas Statistics\elec.dta", replace

********************************************************************************
*Gas Data
********************************************************************************
import excel "C:\Users/`user'\Dropbox\jmp_new\Austin Electricity and Gas Statistics\nat gas.xls", ///
	sheet("Data 1") firstrow cellrange(A3:B344) clear
	
gen salemonth=month(Date)
gen saleyear=year(Date)

rename Texas* texasgas

drop Date

sort saleyear salemonth, stable

save "C:\Users/`user'\Dropbox\jmp_new\Austin Electricity and Gas Statistics\nat gas.dta", replace

********************************************************************************
*Convert market hotness data to stata format
********************************************************************************
import excel "C:\Users\awcassidy1\Dropbox\jmp_new\market_hotness\market_hotness.xlsx", clear firstrow
rename year saleyear
destring saleyear, replace
gen salemonth=month(date(month,"M"))
drop month
save "C:\Users\awcassidy1\Dropbox\jmp_new\market_hotness\market_hotness.dta", replace

********************************************************************************
*read in audit data and save to stata format.
********************************************************************************
import delimited "C:\Users\awcassidy1\Dropbox\jmp_new\ss_output/audit_2_ss-output.csv", clear
save "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/audit_2_ss-output.dta", replace

********************************************************************************
*inflation data: add variables, sort, and save to stata format.
********************************************************************************
import delimited "C:\Users/awcassidy1\Dropbox\jmp_new/inflation_data\CPIAUCSL.csv", clear

gen cpid=dofc(clock(date, "MDY"))
gen salemonth = month(cpid)
gen saleyear=year(cpid)
drop cpid

sort saleyear salemonth

save "C:\Users/awcassidy1\Dropbox\jmp_new/inflation_data\CPIAUCSL.dta", replace

********************************************************************************
*now merge in
*It needs to be unique for each delivery point barcode.
********************************************************************************
use "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/audit_2.dta", clear

gen double auditd = dofc(clock(auditdate, "MD20Yhm"))

sort auditd

rename address1 street
gen state="TX"
gen city="Austin"
destring zip,replace
rename latitude orig_lat
rename longitude orig_long

merge m:1 street city state zip using "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/audit_2_ss-output.dta"
drop if regexm(summary, "No Match")>0

rename yearbuilt yearbuiltaudit

replace programmablethermostatpresent="1" if programmablethermostatpresent=="Y"
replace programmablethermostatpresent="0" if programmablethermostatpresent=="N"

destring programmablethermostatpresent, replace
*programmable thermostat present's name too long, rename it progtherm

rename programmablethermostatpresent progtherm


*generate some variables that are the original variables before combining
*categories, so that we can talk about combinations of categories in final
*dataset.

gen osystem1airhandlertype=system1airhandlertype
gen owaterheatertanktype= waterheatingtanktype
gen oductsystem1type= ductsystem1type

* i am making the categorical values manually

********************************************************************************
*Duct return sizing adequate y/n
********************************************************************************
replace ductsystem1returnsizing="1" if ductsystem1returnsizing=="Adequate"
replace ductsystem1returnsizing="0" if ductsystem1returnsizing=="Inadequate"
la var ductsystem1returnsizing "Duct System 1 Return Sizing Adequate"

destring ductsystem1returnsizing, replace

replace ductsystem2returnsizing="1" if ductsystem2returnsizing=="Adequate"
replace ductsystem2returnsizing="0" if ductsystem2returnsizing=="Inadequate"
la var ductsystem2returnsizing "Duct System 2 Return Sizing Adequate"

destring ductsystem2returnsizing, replace

********************************************************************************
*System1location air handler attic=1
*Not enough garage so combined those.
********************************************************************************

replace system1locationairhandler="1" if system1locationairhandler=="Attic"
replace system1locationairhandler="0" if system1locationairhandler=="Closet" | ///
	system1locationairhandler=="Garage" 

destring system1locationairhandler, replace

********************************************************************************
*System2location air handler attic=1
*Not enough garage so combined those.
********************************************************************************

replace system2locationairhandler="1" if system2locationairhandler=="Attic"
replace system2locationairhandler="0" if system2locationairhandler=="Closet" | ///
	system2locationairhandler=="Garage" 

destring system2locationairhandler, replace

********************************************************************************
*Furnace fuel type gas=1
********************************************************************************
replace furnacefueltype="1" if furnacefueltype=="Gas"
replace furnacefueltype="0" if furnacefueltype=="Elec"

destring furnacefueltype, replace


********************************************************************************
*water heater fuel type gas=1
********************************************************************************
replace waterheaterfueltype="1" if waterheaterfueltype=="Gas"
replace waterheaterfueltype="0" if waterheaterfueltype=="Elec"

destring waterheaterfueltype, replace


********************************************************************************
*Tabulate some variables
********************************************************************************
foreach v  in system1airhandlertype system2airhandlertype waterheatingtanktype ///
	ductsystem1type ductsystem2type {
	di "`v'"
	tab `v'
	}

********************************************************************************
*System1 air handler type vertical=1
********************************************************************************

replace system1airhandlertype="1" if system1airhandlertype=="Down Flow" ///
	| system1airhandlertype=="Up Flow" & ///
	!missing(system1airhandlertype)
replace system1airhandlertype="0" if system1airhandlertype=="Horizontal"

destring system1airhandlertype, replace

********************************************************************************
*System2 air handler type vertical=1
********************************************************************************
replace system2airhandlertype="1" if system2airhandlertype=="Down Flow" ///
	| system2airhandlertype=="Up Flow" & ///
	!missing(system2airhandlertype)
replace system2airhandlertype="0" if system2airhandlertype=="Horizontal"

destring system2airhandlertype, replace


********************************************************************************
*Water Heater Tank Type tankless=1
********************************************************************************

rename waterheatingtanktype waterheatertanktype

replace waterheatertanktype = "1" if waterheatertanktype=="Tankless" | waterheatertanktype=="Solar"
replace waterheatertanktype = "0" if waterheatertanktype=="Tank"

destring waterheatertanktype, replace

********************************************************************************
*Duct system 1 type duct board or sheet metal =1
********************************************************************************
replace ductsystem1type = "0" if ductsystem1type=="Grey_Flex" | ductsystem1type=="Mylar_Flex" | ductsystem1type=="Duct_Board"
replace ductsystem1type = "1" if ductsystem1type=="Sheet_Metal"

destring ductsystem1type, replace

********************************************************************************
*Duct system 2 type duct board or sheet metal = 1
********************************************************************************
replace ductsystem2type = "0" if ductsystem2type=="Grey_Flex" | ductsystem2type=="Mylar_Flex" | ductsystem2type=="Duct_Board" 
replace ductsystem2type = "1" if ductsystem2type=="Sheet_Metal"

destring ductsystem2type, replace


replace ductsystem1type=ductsystem2type if missing(ductsystem1type)

********************************************************************************
*system 1 exists y/n
********************************************************************************

egen sys1exists=rownonmiss(ductsystem1type ductsystem1rvalue ///
	ductsystem1returnsizing ductsystem1leakage system1locationairhandler ///
	system1airhandlertype system1eer system1sqftton)
replace sys1exists=1 if sys1exists >=1

egen sys2exists=rownonmiss(ductsystem2type ductsystem2rvalue ///
	ductsystem2returnsizing ductsystem2leakage system2locationairhandler system2airhandlertype ///
	 system2eer system2sqftton)
replace sys2exists=1 if sys2exists >=1

gen twosystems=0
replace twosystems=1 if sys1exists & sys2exists

********************************************************************************
*TAB TWO SYSTEMS
tab twosystems
********************************************************************************


egen eeravg=rowmean(system1eer system2eer)
egen leakavg=rowmean(ductsystem1leakage ductsystem2leakage)
gen negleakavg=-1*leakavg
egen ductravg=rowmean(ductsystem1rvalue ductsystem2rvalue)
egen sysageavg=rowmean(system1ageyears system2ageyears)
gen negsysageavg=-1*sysageavg
egen rsavg=rowmean(ductsystem1returnsizing ductsystem2returnsizing)
egen sizeavg=rowmean(system1sqftton system2sqftton)
rename windowscreenarearecommendedsqft winrec
rename recommendedadditionalrvalue rrec

*want to think of not window recommendation because want all energy efficiency
*vars to go in one direction.
gen notwinrec = 0 if !missing(winrec)
replace notwinrec=1 if winrec==0

gen atticrvalue2=atticrvalue^2
gen negleakavg2=negleakavg^2
gen eeravg2=eeravg^2
gen ductravg2=ductravg^2
gen negsysageavg2=negsysageavg^2
gen sizeavg2=sizeavg^2
gen rsavg2=rsavg^2



*list of audit vars of interest
local auditvarsofinterest progtherm yearbuiltaudit ///
eeravg negleakavg ductravg negsysageavg rsavg sizeavg winrec rrec atticrvalue2 negleakavg2 eeravg2 ///
ductravg2 negsysageavg2 sizeavg2 rsavg2 twosystems ductsystem1type system1airhandlertype ///
waterheaterfueltype system1locationairhandler ///
furnacefueltype conditionedsqft atticsqft atticrvalue bedrooms waterheatertanktype ///
 notwinrec 


forval i=1/5 {
foreach v in `auditvarsofinterest' {
	bysort delivery_point_barcode: replace `v'=`v'[_n-1] if  missing(`v')
	bysort delivery_point_barcode: replace `v'=`v'[_n+1] if  missing(`v')
	}
	}
	

forval i=1/5 {
foreach v of varlist o* {
	bysort delivery_point_barcode: replace `v'=`v'[_n-1] if  missing(`v')
	bysort delivery_point_barcode: replace `v'=`v'[_n+1] if  missing(`v')
	}
	}


	
drop zip_type record_type
*now collapse each variable by delivery point.
collapse `auditvarsofinterest' (min) auditd , by(delivery_point_barcode home_type osystem1airhandlertype owaterheatertanktype oductsystem1type)

duplicates drop delivery_point_barcode, force

replace yearbuiltaudit=round(yearbuiltaudit, 1)

save "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/audit_3.dta", replace

*done cleaning audit data.


*read in res data for review
import delimited "C:\Users\awcassidy1\Dropbox\jmp_new\ss_output/res_all_2_ss-output.csv", clear
save "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/res_all_2_ss-output.dta", replace
*now merge into mls.

use "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/res_all_2.dta", clear
rename address street
rename state_ state
rename latitude lat_mls
rename longitude long_mls
merge m:1 street city state zip using "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/res_all_2_ss-output.dta"

*now I need to merge in by delivery point.
*first make delivery point missing if no match
replace delivery_point_barcode=. if regexm(summary, "No Match")>0

drop _merge

*merge in audit data. It's at building level, so don't use secondary number (apt number for example).

merge m:1 delivery_point_barcode using "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/audit_3.dta"
drop if _merge==2

replace secondary_number="1" if missing(secondary_number)

*make an id number that has secondary number and delivery point barcode
egen mlsid=group(delivery_point_barcode secondary_number)

*now count sales by delivery_point_barcode

egen sales_by_mlsid=count(sold_lease_date) if !missing(delivery_point_barcode), by(mlsid)

count if sales_by_mlsid==2 & _merge==3

*make a variable indicating whether it was a home that was audited
gen sample_a=0
replace sample_a=1 if _merge==3

tab sales_by_mlsid if sample_a

drop sales_by_mlsid

drop _merge

save "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/combined_1.dta", replace


*permits- get to stata format.
import delimited "C:\Users\awcassidy1\Dropbox\jmp_new\ss_output/permits_ss-output.csv", clear
save "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/permits_ss-output.dta", replace

*permit data cleaning
import delimited "C:\Users\awcassidy1\Dropbox\jmp_new\permit_data/Issued_Construction_Permits.csv", clear
rename originaladdress1 street
rename originalcity city
rename originalzip zip
rename originalstate state

merge m:1 street city zip state using "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/permits_ss-output.dta"
keep if permitclassmapped=="Residential"

drop if regexm(summary, "No Match")>0


*first see whether or not we can use the actual amounts paid:

destring buildingvaluation buildingvaluationremodel electricalvaluation ///
	electricalvaluationremodel mechanicalvaluation mechanicalvaluationremodel ///
	plumbingvaluation, replace ignore($)

egen tjv= rowtotal(buildingvaluation buildingvaluationremodel ///
	electricalvaluation electricalvaluationremodel mechanicalvaluation ///
	mechanicalvaluationremodel plumbingvaluation)

*looks like the various components don't always add up to the total valuation of the job.
*bummer. Can't really do this at a job level unless they only did one thing. Even then,
*might not be reliable. Can really only look at prob of doing something.

replace completeddate=expiresdate if missing(completeddate) & statuscurrent=="Final"

*make the secondary number 1 if it is missing, this way it will match the combined data that we are going
*to merge it into.
replace secondary_number="1" if missing(secondary_number)


gen permitd=date(completeddate,"YMD")
keep permitd delivery_point_barcode secondary_number

*now make the permit date missing if it's before our period, so it could not possibly
*be in between sales. We don't want to delete because we need to know which
*addresses match permit data.
replace permitd=. if permitd<date("2000/1/1","YMD") 

*now make a permit number variable by home so that the data will have one home
*for each line and display multiple permit dates.
duplicates drop
sort delivery_point_barcode secondary_number permitd

by delivery_point_barcode secondary_number: gen permitnum=_n


*now reshape the data so it just has all the permit dates for a particular home.
reshape wide permitd, i(delivery_point_barcode secondary_number) j(permitnum)
save "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/permit_dates.dta", replace

*now, keep sold properties

use "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/combined_1.dta", clear
*make list date variable.
gen double listd = dofc(clock(list_date, "MD20Yhm"))
*before we restrict only to homes that were sold, record old listings that did not sell.
sort mlsid listd
bysort mlsid: gen listseq=_n


keep if !missing(sold_lease_date)
destring sold_lease_price, gen(price)
gen double saled = dofc(clock(sold_lease_date, "MD20Yhm"))

gen saleyear=year(saled)
gen qrtr=qofd(saled)
gen salemonth =month(saled)

merge m:1 saleyear salemonth using "C:\Users\awcassidy1\Dropbox\jmp_new/inflation_data\CPIAUCSL.dta", nogenerate keep(3)

merge m:1 saleyear salemonth using "C:\Users\awcassidy1\Dropbox\jmp_new\market_hotness\market_hotness.dta", nogenerate keep(3)


*convert to march 2019 dollars
replace price=price*(254.14799/cpiaucsl)

*convert list price as well.
destring list_price, replace
replace list_price=list_price*(254.14799/cpiaucsl)



*audited variable.
gen audited= 0
replace audited=1 if auditd<=saled

*complier variable. Noncomplier if they sold a home after the date they were
*supposed to without getting audit.

gen condo=0 if !missing(home_type)
replace condo=1 if home_type=="Condo"




destring year_built, replace

replace yearbuiltaudit=year_built if missing(yearbuiltaudit)

*make compliance deadline
gen comply_deadline= date("6/1/2009", "MDY") if !condo
replace comply_deadline= date("6/1/2011", "MDY") if condo==1


forval i=0/9 {
	replace comply_deadline=date("6/1/201`i'", "MDY") if condo==0 & yearbuiltaudit==200`i'
	}
	
forval i=2/9 {	
		replace comply_deadline=date("6/1/201`i'", "MDY") if condo==1 & yearbuiltaudit==200`i'
	}
	
*for homes with year built >2009, just make their compliance deadline today. They will be dropped.
replace comply_deadline=date("5/1/2019", "MDY") if yearbuiltaudit>2009
	
*non-complier is defined as someone who complied late for purpose of this code.
gen noncomplier=0
replace noncomplier=1 if comply_deadline<=saled & audited==0

gen posthouse=0
replace posthouse=1 if saled>date("6/1/2009","MDY")


gen postcondo=0
replace postcondo=1 if saled>date("6/1/2011","MDY")

foreach v in atticrvalue progtherm eeravg ductravg negleakavg {
	gen posth_`v' = posthouse*`v'
	gen postc_`v' = postcondo*`v'
	}

*make a variable that has the number of sales after first audit
egen sales_after_audit=sum(audited), by(mlsid)

*make a variable that has the number of sales before compliance deadline and before
*any audits
gen before_comply_deadline= 0
replace before_comply_deadline=1 if saled<comply_deadline & audited==0
egen sales_before_comply_deadline = sum(before_comply_deadline), by(mlsid)

browse mlsid saled auditd saleyear audited sales_before_comply_deadline sales_after_audit

*make a variable for early audit
gen early_audit =0
replace early_audit=1 if saled<comply_deadline & audited==1
gen days_early=comply_deadline-saled
replace days_early=. if early_audit==0
*124 early audits.
drop if days_early>60 & !missing(days_early)

browse mlsid days_early saled auditd saleyear yearbuilt audited sales_before_comply_deadline sales_after_audit if early_audit


*drop if no sales after audit
drop if sales_after_audit==0

drop if sales_before_comply_deadline == 0

tab sales_before_comply_deadline

sort mlsid saled
by mlsid: gen salenum=_n

*now get the relative sale num. For this, -1 is the sale just before the audit, 0 is the sale just after the
*audit
gen relsalenum=salenum-sales_before_comply_deadline-1

sort mlsid relsalenum
*now get the price diff and make it nonmissing only for relsalenum=0
xtset mlsid relsalenum

gen pricediff=price-l.price if relsalenum==0

browse mlsid relsalenum price saleyear pricediff 

gen lsaleyear=l.saleyear if !missing(pricediff)


gen yearbuiltcat=0 if yearbuiltaudit<1999
replace yearbuiltcat=yearbuiltaudit if yearbuiltaudit>=1999




gen post=0
replace post=1 if relsalenum>=0 & !missing(relsalenum)

gen post_negleakavg=post*negleakavg


*for looking at prior sales.
sort mlsid relsalenum
xtset mlsid relsalenum

gen pricediff_baseline=price-l.price if relsalenum<0 & !missing(relsalenum)
gen lsaleyear_baseline=l.saleyear if !missing(pricediff_baseline)

*rescale price to be in 10,000's
replace pricediff=pricediff/10000
replace pricediff_baseline=pricediff_baseline/10000

save "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/combined_3.dta", replace

use "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/combined_3.dta", clear

*merge in the permits
replace secondary_number="1" if missing(secondary_number)
merge m:1 delivery_point_barcode secondary_number using ///
	"C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/permit_dates.dta", keep(1 3)
	
gen matchedpermitdb=0 
replace matchedpermitdb=1 if _merge==3

drop _merge

*no matches have over 20 permits.

*make a lagged sale date so we know if the permitted improvement happened between
*sales.
sort mlsid relsalenum

xtset mlsid relsalenum
gen lsaled= l.saled
duplicates drop sold_lease_price mlsid saled, force

*annualize the price difference
gen dbs=saled-lsaled
gen an_pd=(pricediff/dbs)*365

gen permitbetweensales=0 if !missing(pricediff) & matchedpermitdb==1
*loop through permit date #s to see if any are between the 2 sales.
forval i=1/31 {
	replace permitbetweensales= 1 if !missing(permitd`i') & permitd`i'>=lsaled & permitd`i'<=saled & !missing(lsaled) & !missing(pricediff)
	}
sum permitbetweensales if !missing(pricediff)
*looks like it affects 26 percent of sales, roughly.

drop permitd*

save "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/combined_4.dta", replace


use "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/combined_4.dta", clear

sort mlsid relsalenum

xtset mlsid relsalenum

gen laggedprice=l.price if !missing(pricediff)

gen lpricediff= log(price) - log(laggedprice)

la var negleakavg "$-$ \% Duct Leakage"
la var pricediff "$ \Delta $ Price"
la var lpricediff "$ \Delta $ Log(Price)"


la var waterheatertanktype "Tankless or Solar WH"
la var ductsystem1type "Metal Ducts"
la var twosystems "2 Systems"
la var furnacefueltype "Gas Furnace"
la var system1airhandlertype "Vertical AH"
la var waterheaterfueltype "Gas WH"
la var system1locationairhandler "AH in Closet"

la var price "Price" 
la var eeravg "EER"
la var negleakavg "$-$ \% Duct Leakage"
la var atticrvalue "Attic R-value"
la var audited "Audited"

la var rsavg "Return Sizing Adequate"
la var ductravg "Duct R-value"

la var negsysageavg "$-$ System Age"

la var sizeavg "HVAC Size (sqft/ton)"
la var notwinrec "Did Not Rec WS"

la var progtherm "Programmable Thermostat"
la var rrec "Amount of Recommended Additional R-value"



destring num_beds_total num_baths_total , force replace
gen pool=0 if !missing(pool_on_property)
replace pool=1 if pool_on_property=="TRUE"


codebook subdivision
*clean up subdivision
replace subdivision =upper(subdivision)
forval i=1/9 {
	replace subdivision = subinstr(subdivision,"0`i'","`i'",.)
	
	}

replace subdivision = subinstr(subdivision,"PHS1A","PHS 1A",.)	

	
replace subdivision = subinstr(subdivision,"-","",.)
replace subdivision = subinstr(subdivision,"'","",.)

replace subdivision = subinstr(subdivision,"&","AND",.)

replace subdivision = subinstr(subdivision,"ADDN","ADD",.)
replace subdivision = subinstr(subdivision,"ADDITION","ADD",.)
replace subdivision = subinstr(subdivision,"AMMENDED","AMD",.)
replace subdivision = subinstr(subdivision,"AMENDED","AMD",.)
replace subdivision = subinstr(subdivision,"AME","AMD",.)
replace subdivision = subinstr(subdivision,"*"," ",.)
replace subdivision = subinstr(subdivision,"/"," ",.)


replace subdivision = subinstr(subdivision,"  "," ",.)
replace subdivision = subinstr(subdivision,"THE ","",.)
replace subdivision = subinstr(subdivision," THE","",.)
replace subdivision = subinstr(subdivision,","," ",.)
replace subdivision = subinstr(subdivision," I A","1A",.)
replace subdivision = subinstr(subdivision," II A","2A",.)
replace subdivision = subinstr(subdivision," I "," 1 ",.)
replace subdivision = subinstr(subdivision," II "," 2 ",.)
replace subdivision = subinstr(subdivision," III "," 3 ",.)
replace subdivision = subinstr(subdivision," IV "," 4 ",.)
replace subdivision = subinstr(subdivision," V "," 5 ",.)
replace subdivision = subinstr(subdivision," VI "," 6 ",.)
replace subdivision=strtrim(subdivision)

replace subdivision = subinstr(subdivision," VI"," 6",.)
replace subdivision = subinstr(subdivision," IV"," 4",.)
replace subdivision = subinstr(subdivision," V"," 5",.)
replace subdivision = subinstr(subdivision," III"," 3",.)
replace subdivision = subinstr(subdivision," II"," 2",.)

replace subdivision = subinstr(subdivision," I"," 1",.)

replace subdivision = subinstr(subdivision,"CONDOMINIUMS","CONDO",.)
replace subdivision = subinstr(subdivision,"CONDOS","CONDO",.)
replace subdivision = subinstr(subdivision,"CONDOMINIUM","CONDO",.)
replace subdivision = subinstr(subdivision,"CONDOMIN","CONDO",.)

forval i=1/9 {
	replace subdivision = subinstr(subdivision,"SEC`i'A","SEC `i' A",.)
	replace subdivision = subinstr(subdivision,"SEC`i'B","SEC `i' B",.)
	replace subdivision = subinstr(subdivision,"SEC`i'C","SEC `i' C",.)
	replace subdivision = subinstr(subdivision,"SEC`i'D","SEC `i' D",.)
	replace subdivision = subinstr(subdivision,"SEC`i'E","SEC `i' E",.)
	replace subdivision = subinstr(subdivision,"NO`i'A","NO `i' A",.)
	replace subdivision = subinstr(subdivision,"NO`i'B","NO `i' B",.)
	replace subdivision = subinstr(subdivision,"NO`i'C","NO `i' C",.)
	replace subdivision = subinstr(subdivision,"NO`i'D","NO `i' D",.)
	replace subdivision = subinstr(subdivision,"NO`i'E","NO `i' E",.)

	}







replace subdivision = subinstr(subdivision," VLY"," VALLEY",.)
replace subdivision = subinstr(subdivision,"@"," AT ",.)
replace subdivision = subinstr(subdivision," OF "," ",.)
replace subdivision = subinstr(subdivision,"("," ",.)
replace subdivision = subinstr(subdivision,")"," ",.)


replace subdivision = subinstr(subdivision,"WOODCRESTA","WOODCREST A",.)
replace subdivision = subinstr(subdivision,"PK","PARK",.)	

replace subdivision = subinstr(subdivision,".","",.)	

forval i=1/9 {
	replace subdivision = subinstr(subdivision,"`i' A","`i'A",.)	
	replace subdivision = subinstr(subdivision,"`i' B","`i'B",.)	
	replace subdivision = subinstr(subdivision,"`i' C","`i'C",.)	
	replace subdivision = subinstr(subdivision,"`i' D","`i'D",.)	
	replace subdivision = subinstr(subdivision,"`i' E","`i'E",.)	

	}
	replace subdivision = subinstr(subdivision,"TWNHMS","TOWNHOMES",.)
	replace subdivision = subinstr(subdivision,"SECTION","SEC",.)
	
	replace subdivision = subinstr(subdivision,"SEC","",.)	


replace subdivision = subinstr(subdivision," TH"," ",.)
replace subdivision = subinstr(subdivision,"SUBD","",.)
replace subdivision = subinstr(subdivision,"SUBDIVISION","",.)

replace subdivision = subinstr(subdivision," PHS "," PHASE ",1)
replace subdivision = subinstr(subdivision," PH "," PHASE ",1)

replace subdivision = subinstr(subdivision," NO "," ",1)

	replace subdivision=strtrim(subdivision)

forval i=1/9 {
	replace subdivision = subinstr(subdivision,"  "," ",.)
	}

codebook subdivision
tab subdivision


destring lat_mls long_mls, replace

geodist lat_mls long_mls latitude longitude, miles gen(dist)
destring acres, replace
*the only reason a lat long would be off by so much is if large amounts of land.
*640 acres in one mile. So we should not tolerate a dist of<2 with less than 640 acres
*of land.
drop if dist>2 & acres<640 & !missing(acres) & !missing(dist)

save "C:\Users\awcassidy1\Dropbox\jmp_new\cleaned_data/combined_5.dta", replace
